import os
import csv
import glob
import functools

from collections import namedtuple
from config import DATA_PATH

# Our list of candidate information will have
#  - nodule status, what we're going to be training the model to classify
#  - diameter, useful for getting a good spread in training
#  - series, to locate the correct CT scan
#  - candidate center, to find the candidate in the larger CT
CandidateInfoTuple = namedtuple(
    "CandidateInfoTuple",
    [
        "isNodule_bool",       # classification target: is this candidate a nodule?
        "hasAnnotation_bool",  # was this row backed by a human annotation?
        "isMal_bool",          # malignancy flag from the annotation data
        "diameter_mm",         # nodule diameter; 0.0 for non-nodule candidates
        "series_uid",          # identifies the CT scan this candidate came from
        "center_xyz",          # candidate center in patient (xyz) coordinates
    ],
)


# In-memory caching
# Since parsing some of the data files can be slow, we'll cache the results
# of this function call in memory. This will come in handy later, because
# we'll be calling this function more often in the future. Speeding up our
# data pipeline by carefully applying in-memory or on-disk caching can result
# in some pretty impressive gains in training speed.
@functools.lru_cache(1)
def getCandidateInfoList(requireOnDisk_bool=True):
    """Build the combined list of nodule annotations and non-nodule candidates.

    Parses annotations_with_malignancy.csv (actual nodules, with diameter and
    malignancy info) and candidates.csv (candidate locations, overwhelmingly
    non-nodules) into CandidateInfoTuple records. The result is sorted in
    descending tuple order: nodules first (largest diameter first), then all
    non-nodule candidates.

    :param requireOnDisk_bool: when True, skip any series whose .mhd file is
        not present in a LUNA16 subset directory on disk.
    :return: list of CandidateInfoTuple, sorted descending.
    """
    # We construct a set with all series_uids that are present on disk.
    # This will let us use the data, even if we haven't downloaded all of
    # the subsets yet.
    mhd_list = glob.glob(os.path.join(DATA_PATH, "LUNA16/subset*/*.mhd"))
    presentOnDisk_set = {os.path.split(p)[-1][:-4] for p in mhd_list}

    # It turns out that several of the candidates listed in candidates.csv are
    # present multiple times. To make it even more interesting, those entries
    # are not exact duplicates of one another. Instead, it seems that the
    # original human annotations weren't sufficiently cleaned before being
    # entered in the file. They might be annotations on the same nodule on
    # different slices, which might even have been beneficial for our classifier.

    # The LUNA dataset is derived from another dataset called the Lung Image
    # Database Consortium image collection (LIDC-IDRI) and includes detailed
    # annotation information from multiple radiologists. We've already done the
    # legwork to get the original LIDC annotations, pull out the nodules,
    # dedupe them, and save them to the file annotations_with_malignancy.csv.

    # wc -l data/annotations_with_malignancy.csv
    # 1183 data/annotations_with_malignancy.csv
    # head data/annotations_with_malignancy.csv
    # seriesuid,coordX,coordY,coordZ,diameter_mm,mal_bool,mal_details,bboxLowX,bboxLowY,bboxLowZ,bboxHighX,bboxHighY,bboxHighZ,len_mal_details
    # 1.3.6.1.4.1.14519.5.2.1.6279.6001.100225287222365663678666836860,-128.6994211,-175.31927180000002,-298.38750639999995,5.651470635,True,"[4, 2, 4, 2]",-131.89648,-178.25976125,-299.8000036716461,-125.4511675,-172.45898,-296.2000037670135,4

    candidateInfo_list = []
    with open(os.path.join(DATA_PATH, "annotations_with_malignancy.csv"), "r") as f:
        reader = csv.reader(f)
        next(reader, None)  # skip the header row; stream the rest lazily
        for row in reader:
            series_uid = row[0]

            # Cheap flag check first; only probe the set when it matters.
            if requireOnDisk_bool and series_uid not in presentOnDisk_set:
                continue

            annotationCenter_xyz = tuple(float(x) for x in row[1:4])
            annotationDiameter_mm = float(row[4])
            # Strict mapping on purpose: any value other than "True"/"False"
            # is a data error and should raise KeyError rather than be guessed.
            isMal_bool = {"False": False, "True": True}[row[5]]

            candidateInfo_list.append(
                CandidateInfoTuple(
                    True,   # isNodule_bool: every annotation row is a nodule
                    True,   # hasAnnotation_bool
                    isMal_bool,
                    annotationDiameter_mm,
                    series_uid,
                    annotationCenter_xyz,
                )
            )

    # wc -l data/candidates.csv
    # 551066 candidates.csv
    #
    # head data/candidates.csv
    # seriesuid,coordX,coordY,coordZ,class
    # 1.3.6.1.4.1.14519.5.2.1.6279.6001.100225287222365663678666836860,-56.08,-67.85,-311.92,0
    #
    # grep `,1$` data/candidates.csv | wc -l
    # 1351
    #
    # Unfortunately, the location information provided in annotations.csv
    # doesn't always precisely line up with the coordinates in candidates.csv
    # Since the nodule in question has a diameter of 5 mm, both of these points
    # are clearly meant to be the "center" of the nodule, but they don't line up
    # exactly. We are going to do the legwork to make things line up.
    #
    # grep 100225287222365663678666836860 data/annotations.csv
    # 1.3.6.1.4.1.14519.5.2.1.6279.6001.100225287222365663678666836860,-128.6994211,-175.3192718,-298.3875064,5.651470635
    # 1.3.6.1.4.1.14519.5.2.1.6279.6001.100225287222365663678666836860,103.7836509,-211.9251487,-227.12125,4.224708481
    #
    # grep '100225287222365663678666836860.*,1$' data/candidates.csv
    # 1.3.6.1.4.1.14519.5.2.1.6279.6001.100225287222365663678666836860,104.16480444,-211.685591018,-227.011363746,1
    # 1.3.6.1.4.1.14519.5.2.1.6279.6001.100225287222365663678666836860,-128.94,-175.04,-297.87,1

    with open(os.path.join(DATA_PATH, "candidates.csv"), "r") as f:
        # candidates.csv has ~551k rows; stream it instead of materializing
        # the whole file as a list just to skip the header.
        reader = csv.reader(f)
        next(reader, None)
        for row in reader:
            series_uid = row[0]

            if requireOnDisk_bool and series_uid not in presentOnDisk_set:
                continue

            isNodule_bool = bool(int(row[4]))
            candidateCenter_xyz = tuple(float(x) for x in row[1:4])

            # Actual nodules already came from the (deduped, malignancy-aware)
            # annotations file above; here we only keep the non-nodules.
            if not isNodule_bool:
                candidateInfo_list.append(
                    CandidateInfoTuple(
                        False,  # isNodule_bool
                        False,  # hasAnnotation_bool
                        False,  # isMal_bool
                        0.0,    # diameter unknown for non-nodules
                        series_uid,
                        candidateCenter_xyz,
                    )
                )

    # This means we have all of the actual nodule samples starting with the
    # largest first, followed by all of the non-nodule samples (which don't
    # have nodule size information).
    candidateInfo_list.sort(reverse=True)
    return candidateInfo_list


@functools.lru_cache(1)
def getCandidateInfoDict(requireOnDisk_bool=True):
    """Group the full candidate list by scan, for fast per-series lookup.

    :param requireOnDisk_bool: forwarded to getCandidateInfoList.
    :return: dict mapping series_uid -> list of CandidateInfoTuple for that CT.
    """
    grouped_by_series = {}

    for info_tup in getCandidateInfoList(requireOnDisk_bool):
        uid = info_tup.series_uid
        if uid not in grouped_by_series:
            grouped_by_series[uid] = []
        grouped_by_series[uid].append(info_tup)

    return grouped_by_series
